tidymodels 2 with bootstrap resamples and workflow

R
Author

Tony Duan

Published

October 12, 2023

1 packages

Code
library(tidyverse)
library(ggplot2)
library(tidymodels)

2 data

Data from https://www.kaggle.com/c/titanic/data

Code
pred <- c("Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked", "title")
train_df_raw <- read_csv('data/train.csv')
test_df_raw <- read_csv('data/test.csv')
glimpse(train_df_raw)
Rows: 891
Columns: 12
$ PassengerId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
$ Survived    <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
$ Pclass      <dbl> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
$ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
$ Sex         <chr> "male", "female", "female", "female", "male", "male", "mal…
$ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
$ SibSp       <dbl> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
$ Parch       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
$ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
$ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
$ Cabin       <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C…
$ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…

2.1 train data

Code
train_df=train_df_raw %>%
  mutate(Survived=as.factor(Survived),
         title = str_trim(str_replace(str_extract(Name, ", [A-Z]+[A-Za-z.]*[:space:]+"), ",", ""))
  )

#train_df=train_df %>% select(c(all_of(pred),"Survived")) 

dim(train_df)
[1] 891  13
Code
train_df %>% count(Survived)
# A tibble: 2 × 2
  Survived     n
  <fct>    <int>
1 0          549
2 1          342
Code
342/(549+342)
[1] 0.3838384
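
As a quick sanity check on the title regex above (not part of the original output), counting the extracted titles should show the familiar Mr/Mrs/Miss/Master groups plus a handful of rarer ones:

Code
train_df %>% count(title, sort = TRUE)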

2.2 test data

Code
test_df=test_df_raw %>%
  mutate(
         title = str_trim(str_replace(str_extract(Name, ", [A-Z]+[A-Za-z.]*[:space:]+"), ",", ""))
  )

#test_df=test_df %>% select(c(all_of(pred))) 
dim(test_df)
[1] 418  12

2.3 bootstrap (re)samples for model selection

Code
set.seed(2022)
titanic_folds <- bootstraps(data = train_df, 
                            times = 10)
titanic_folds
# Bootstrap sampling 
# A tibble: 10 × 2
   splits            id         
   <list>            <chr>      
 1 <split [891/328]> Bootstrap01
 2 <split [891/338]> Bootstrap02
 3 <split [891/318]> Bootstrap03
 4 <split [891/316]> Bootstrap04
 5 <split [891/328]> Bootstrap05
 6 <split [891/317]> Bootstrap06
 7 <split [891/330]> Bootstrap07
 8 <split [891/339]> Bootstrap08
 9 <split [891/322]> Bootstrap09
10 <split [891/317]> Bootstrap10
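
Each split holds an analysis set (the bootstrap sample, drawn with replacement from train_df) and an assessment set (the out-of-bag rows used for evaluation). A minimal sketch for inspecting the first resample:

Code
first_split <- titanic_folds$splits[[1]]
dim(analysis(first_split))   # bootstrap sample: same number of rows as train_df
dim(assessment(first_split)) # out-of-bag rows (328 for Bootstrap01, per the table above)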

3 model

3.1 recipe

Code
# declare recipe
titanic_recipe <- 
  recipe(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked, 
         data = train_df) %>% # keep variables we want
  step_impute_median(Age,Fare) %>% # imputation
  step_impute_mode(Embarked) %>% # imputation
  step_mutate_at( Pclass, Sex, Embarked, fn = factor) %>% # make these factors
  step_mutate(Travelers = SibSp + Parch + 1) %>% # new variable
  step_rm(SibSp, Parch) %>% # remove variables
  step_dummy(all_nominal_predictors()) %>% # create indicator variables
   # normalize numerical variables 
  step_normalize(all_numeric_predictors())  %>% prep()


summary(titanic_recipe)
# A tibble: 9 × 4
  variable   type      role      source  
  <chr>      <list>    <chr>     <chr>   
1 Age        <chr [2]> predictor original
2 Fare       <chr [2]> predictor original
3 Survived   <chr [3]> outcome   original
4 Travelers  <chr [2]> predictor derived 
5 Pclass_X2  <chr [2]> predictor derived 
6 Pclass_X3  <chr [2]> predictor derived 
7 Sex_male   <chr [2]> predictor derived 
8 Embarked_Q <chr [2]> predictor derived 
9 Embarked_S <chr [2]> predictor derived 
Code
juice_titanic_recipe=juice(titanic_recipe)
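
juice() returns the preprocessed training data, i.e. the prep()-ed recipe applied to train_df; on a prepped recipe it is equivalent to bake(titanic_recipe, new_data = NULL). A quick look at the result (output not shown):

Code
glimpse(juice_titanic_recipe)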

3.2 model

Code
# logistic regression
titanic_glm_spec <- 
  logistic_reg() %>% # model
  set_engine('glm') %>%  # package to use
  set_mode('classification') # choose one of two: classification vs regression

# random forest
titanic_rf_spec <-  
  rand_forest(trees = 200) %>% # algorithm-specific argument: 200 trees
  set_engine('ranger') %>% 
  set_mode('classification')

# svm
titanic_svm_spec <-  
  svm_rbf() %>% # rbf = radial basis function kernel
  set_engine('kernlab') %>% 
  set_mode('classification')
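
To see how parsnip maps each specification onto the underlying engine, translate() can be inspected (output not shown):

Code
titanic_rf_spec %>% translate() # shows the ranger::ranger() call that will be generated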

3.3 workflow

Code
# logistic regression
doParallel::registerDoParallel() # resample fitting is an embarrassingly parallel problem

titanic_glm_wf <- 
  workflow() %>% 
  add_recipe(titanic_recipe) %>% 
  add_model(titanic_glm_spec) 

# random forest
doParallel::registerDoParallel()
titanic_rf_wf <- 
  workflow() %>% 
  add_recipe(titanic_recipe) %>% 
  add_model(titanic_rf_spec) 
  
# svm
doParallel::registerDoParallel()
titanic_svm_wf <- 
  workflow() %>% 
  add_recipe(titanic_recipe) %>% 
  add_model(titanic_svm_spec) 
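
As an aside (not used in this post), the three specifications could also be bundled with the workflowsets package, which pairs preprocessors and models and can resample them all in one go:

Code
library(workflowsets)

titanic_wf_set <- workflow_set(
  preproc = list(recipe = titanic_recipe),
  models  = list(glm = titanic_glm_spec,
                 rf  = titanic_rf_spec,
                 svm = titanic_svm_spec)
)
# workflow_map() could then run fit_resamples() over all three workflows at once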

3.4 training

Code
glm_model_fit=titanic_glm_wf %>% fit_resamples(titanic_folds)

rf_model_fit=titanic_rf_wf %>% fit_resamples(titanic_folds)

svm_model_fit=titanic_svm_wf %>% fit_resamples(titanic_folds)
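
fit_resamples() keeps only the metrics by default. To also keep the out-of-bag predictions (for example to plot ROC curves later), a control object can be passed, as in this sketch (not part of the original run):

Code
ctrl <- control_resamples(save_pred = TRUE)

rf_model_fit_pred <- titanic_rf_wf %>%
  fit_resamples(titanic_folds, control = ctrl)

# collect_predictions(rf_model_fit_pred) then returns one row per out-of-bag observation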

4 result

Code
collect_metrics(glm_model_fit) 
# A tibble: 2 × 6
  .metric  .estimator  mean     n std_err .config             
  <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
1 accuracy binary     0.793    10 0.00286 Preprocessor1_Model1
2 roc_auc  binary     0.842    10 0.00640 Preprocessor1_Model1
Code
collect_metrics(rf_model_fit) 
# A tibble: 2 × 6
  .metric  .estimator  mean     n std_err .config             
  <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
1 accuracy binary     0.822    10 0.00456 Preprocessor1_Model1
2 roc_auc  binary     0.865    10 0.00554 Preprocessor1_Model1
Code
collect_metrics(svm_model_fit) 
# A tibble: 2 × 6
  .metric  .estimator  mean     n std_err .config             
  <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
1 accuracy binary     0.811    10 0.00472 Preprocessor1_Model1
2 roc_auc  binary     0.833    10 0.00331 Preprocessor1_Model1
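
To compare the three models side by side, the metric tibbles can be stacked into one table (a convenience sketch, not in the original):

Code
bind_rows(
  collect_metrics(glm_model_fit) %>% mutate(model = "logistic regression"),
  collect_metrics(rf_model_fit)  %>% mutate(model = "random forest"),
  collect_metrics(svm_model_fit) %>% mutate(model = "svm")
) %>%
  select(model, .metric, mean, std_err) %>%
  arrange(.metric, desc(mean))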

4.1 last fit

It seems that random forest is the winner, with about 82% accuracy and an ROC AUC of 0.865 across the bootstrap resamples. We use it for a final fit on the whole training data.

Code
#random forest workflow
titanic_rf_last_wf <- 
  workflow() %>% 
  add_recipe(titanic_recipe) %>% 
  add_model(titanic_rf_spec)
Code
# last fit
final_fit <- 
  fit(object = titanic_rf_last_wf, 
      data = train_df)
Code
# result: the estimated (prepped) recipe inside the fitted workflow
final_fit %>% 
  extract_recipe(estimated = TRUE)
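
Similarly, the fitted ranger model itself can be pulled out of the workflow (output not shown):

Code
final_fit %>% extract_fit_parsnip()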

5 predictions

Code
test_pred <-
  final_fit %>% predict(test_df)
Code
final_result=test_pred %>% bind_cols(test_df) %>% 
  select(PassengerId, .pred_class) %>%
  rename(Survived=.pred_class)
Code
head(final_result)
# A tibble: 6 × 2
  PassengerId Survived
        <dbl> <fct>   
1         892 0       
2         893 0       
3         894 0       
4         895 0       
5         896 1       
6         897 0       
Code
final_result %>% count(Survived)
# A tibble: 2 × 2
  Survived     n
  <fct>    <int>
1 0          295
2 1          123
Code
123/(123+295)
[1] 0.2942584
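
For a Kaggle submission, final_result only needs to be written out as a CSV (the file name below is just an example):

Code
final_result %>% write_csv("titanic_submission.csv")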

6 References

https://www.kaggle.com/c/titanic/data

https://rpubs.com/tsadigov/titanic_tidymodels